Created filtered CSVs and static logo plots¶

InĀ [1]:
# Imports 
import os
import warnings
import dmslogo
import pandas as pd
import seaborn as sns
import matplotlib.colors
import matplotlib.pyplot as plt

# Plotting colors
tol_muted_adjusted = [
    "#000000",
    "#CC6677", 
    "#1f78b4", 
    "#DDCC77", 
    "#117733", 
    "#882255", 
    "#88CCEE",
    "#44AA99", 
    "#999933", 
    "#AA4499", 
    "#DDDDDD",
]

# Create color palette
def color_gradient_hex(start, end, n):
    """Color function from polyclonal"""
    cmap = matplotlib.colors.LinearSegmentedColormap.from_list(
            name="_", colors=[start, end], N=n
        )
    return [matplotlib.colors.rgb2hex(tup) for tup in cmap(list(range(0, n)))]

# Seaborn style settings
sns.set(rc={
    "figure.dpi":300, 
    "savefig.dpi":300,
    "svg.fonttype":"none",
})
sns.set_style("ticks")
sns.set_palette(tol_muted_adjusted)

# Suppress warnings
warnings.simplefilter("ignore")

File paths for data:

InĀ [2]:
# this cell is tagged as `parameters` for papermill parameterization
func_scores = None
antibody_escape_dir = None

filtered_csv_dir = None
escape_plots_dir = None

times_seen = None
n_selections = None
frac_models = None
cell_entry_default = None

filtered_func_scores = None
InĀ [3]:
# Parameters
func_scores = "results/func_effects/averages/293T_entry_func_effects.csv"
filtered_func_scores = "results/filtered_CSVs/293T_entry_filtered_mut_effects.csv"
antibody_escape_dir = "results/antibody_escape/averages/"
filtered_csv_dir = "results/filtered_CSVs/"
escape_plots_dir = "results/escape_plots/"
times_seen = 2
n_selections = 8
frac_models = 1
cell_entry_default = -3
InĀ [4]:
# # Uncomment for running interactive
# func_scores = "../results/func_effects/averages/293T_entry_func_effects.csv"
# antibody_escape_dir = "../results/antibody_escape/averages/"

# filtered_csv_dir = "../results/filtered_CSVs/"
# escape_plots_dir = "../results/escape_plots/"

# times_seen = 2
# n_selections = 8
# frac_models = 1
# cell_entry_default = -3

# filtered_func_scores = "../results/filtered_CSVs/293T_entry_filtered_mut_effects.csv"

Process antibody escape data and create both a filtered CSV file and antibody escape plot

InĀ [5]:
# Initialize list of antibodies and sites for further analysis
GPC_A_akin_antibodies = [
    "S43711",
    "S44428",
    "S44446",
    "S44478",
]
GPC_A_akin_escape_sites = []

GPC_A_hybrid_antibodies = [
    "S43720",
    "S44433",
]
GPC_A_hybrid_escape_sites = []

GPC_C_hybrid_antibodies = [
    "S4378",
    "S43727",
]
GPC_C_hybrid_escape_sites = []

def get_filtered_csv_and_logo_plot(
    antibody_name,  
    func_effects_file, 
    min_times_seen, 
    min_func_score, 
    min_n_selections, 
    min_frac_models,
    filtered_file_dir = None,
    floor = False,
    sites = None,
    num_sites = 20,
    color_contact_sites = False,
    contacts_file = None,
    plots_dir = None,
    ax = [],
):
    """
    Function that filters and writes an antibody escape csv.
    Also creates summed escape profiles and logoplots.
    """

    # Load data
    escape_df = pd.read_csv(antibody_escape_dir+antibody_name+"_mut_effect.csv")
    func_effects = pd.read_csv(func_effects_file)

    # Create mutation column to match antibody df
    # **
    # Pipeline does not filter functional scores 
    # prior to filtering antibody selections
    # **
    func_effects["site"] = func_effects["site"].astype(str)
    func_effects["mutation"] = func_effects["wildtype"] + func_effects["site"] + func_effects["mutant"]
    func_scores_filter = func_effects.loc[func_effects["effect"] >= min_func_score]["mutation"].tolist()

    # Filter escape df for min times seen
    # min fraction of models, and no stop codons
    escape_df = (
        escape_df.loc[
            (escape_df["times_seen"] >= min_times_seen)
            &
            (escape_df["frac_models"] >= min_frac_models)
            &
            (escape_df["mutant"] != "*")
        ]
        .rename(columns={"escape_median" : "escape"})
        .reset_index(drop=True)
    )

    # Drop all but a few columns
    escape_df = escape_df[[
            "site", 
            "wildtype", 
            "mutant", 
            "mutation", 
            "escape",
    ]]

    # Mark mutations that are below functional cutoff
    # and replace measurements with NaN
    escape_df["poor_cell_entry"] = (
        escape_df["mutation"].apply(lambda x: False if x in func_scores_filter else True)
    )
    escape_df["escape"] = (
        escape_df.apply(lambda x: float("NaN") if (x["poor_cell_entry"] == True) else x["escape"], axis=1)
    )

    # Create a floored escape column
    escape_df["floored_escape"] = escape_df["escape"].clip(lower=0)

    # Write filtered escape to csv
    if filtered_file_dir != None:
        escape_df.to_csv(filtered_file_dir+antibody_name+"_filtered_mut_effects.csv", index=False)

    # Show only positive escape values
    if floor == True:
        escape_df["escape"] = escape_df["floored_escape"]

    # Add site escape column
    summed_df = (
        escape_df
        .groupby(["site", "wildtype"])
        .aggregate({
            "escape" : "sum",
        })
        .rename(columns={"escape" : "site_escape"})
        .reset_index()
    )
    escape_df = (
        escape_df.merge(
            summed_df,
            how="left",
            on=["site", "wildtype"],
            validate="many_to_one",
        )
    )
    
    # If not sites are given, then show top summed escape sites
    if sites == None:
        sites = sorted(summed_df.nlargest(num_sites, "site_escape")["site"].tolist())
        if antibody_name in GPC_A_akin_antibodies:
            GPC_A_akin_escape_sites.append(sorted(summed_df.nlargest(10, "site_escape")["site"].tolist()))
        if antibody_name in GPC_A_hybrid_antibodies:
            GPC_A_hybrid_escape_sites.append(sorted(summed_df.nlargest(25, "site_escape")["site"].tolist()))
        if antibody_name in GPC_C_hybrid_antibodies:
            GPC_C_hybrid_escape_sites.append(sorted(summed_df.nlargest(25, "site_escape")["site"].tolist()))
            

    # Mark escape sites for each antibody to show
    escape_df["show_site"] = escape_df.apply(lambda x: True if x["site"] in sites else False, axis=1)

    # Shade contact sites in logo plot
    if color_contact_sites == True:
        # **
        # Currently there are no known structures for these antibodies
        contacts_df = pd.read_csv(contacts_file)
        shade_sites = list(contacts_df.loc[contacts_df["distance"] == 4]["position"].unique())
        escape_df["shade_site"] = escape_df.apply(lambda x: "#DDCC77" if x["site"] in shade_sites else None, axis=1)
        # **
    else:
        escape_df["shade_site"] = None
    escape_df["shade_alpha"] = 0.35

    # Merge functional and escape dfs
    func_effects["site"] = func_effects["site"].astype("int")
    func_effects = func_effects.loc[func_effects["mutant"] != "*"] # remove stop codons
    func_effects["effect"] = func_effects["effect"].clip(upper=0, lower=(min_func_score + (min_func_score*0.05))) # clip scores 
    escape_df = (
        escape_df.merge(
            func_effects[[
                "site", 
                "wildtype", 
                "mutant", 
                "mutation",
                "effect",
            ]],
            how="left",
            on=["site", "wildtype", "mutant", "mutation"],
            validate="one_to_one",
        )
    )
    # Missing functional values are filled as min to make less visible
    escape_df["effect"] = escape_df["effect"].fillna(min_func_score + (min_func_score*0.05)) 
    
    # Add color column for logo plots
    func_color_map = dmslogo.colorschemes.ValueToColorMap(
        minvalue=func_effects["effect"].min(),
        maxvalue=func_effects["effect"].max(),
        cmap=matplotlib.colors.ListedColormap(color_gradient_hex("white", "#000000", n=20))
    )
    escape_df = (
        escape_df.assign(
            color=lambda x: x["effect"].map(func_color_map.val_to_color)
        )
    )

    # Add wildtype to each site for logo plot
    escape_df["wt_site"] = escape_df["wildtype"] + escape_df["site"].map(str)

    # Fill in missing sites because logo plot requires
    # a sequential list of sites
    seen_list = [False]*491
    for index in range(len(escape_df.index)):
        site = escape_df.at[index, "site"] - 1 
        seen_list[site] = True
    for index, seen in enumerate(seen_list):
        if seen == False:
            site = index + 1
            # Add missing sites
            escape_df.loc[len(escape_df.index)] = [
                site,
                "X",
                "X",
                "X"+str(site)+"X",
                float("nan"),
                False,
                float("nan"),
                float("nan"),
                False,
                None,
                0.35,
                min_func_score + (min_func_score*0.05),
                "#ffffff",
                "X"+str(site),
            ]

    # Sort by site
    escape_df = (
        escape_df
        .sort_values(by="site")
        .astype({"mutant" : "str"})
        .reset_index(drop=True)
    )

    # Create fig if no ax passed to function
    if len(ax) == 0:
        # Set figure size and subplots
        fig, axes = plt.subplots(
            1, 
            3, 
            gridspec_kw={"width_ratios":[1,3,0.1]},
            figsize=(12, 2.5), 
        )
        
        # Adjust spacing of subplots
        fig.subplots_adjust(
            wspace=0.3, 
            top=0.8,
            bottom=0.3,
        )

        # Plot escape profiles and logo plots
        # Lineplot
        _, lineplot = dmslogo.draw_line(
            escape_df,
            x_col="site",
            height_col="site_escape",
            show_col="show_site",
            ax=axes[0],
            show_color="#CC6677",
            linewidth=1,
        )
        # Logoplot
        _, logoplot = dmslogo.draw_logo(
            escape_df.query("show_site == True"),
            x_col="site",
            letter_col="mutant",
            letter_height_col="escape",
            ax=axes[1],
            xtick_col="wt_site",
            color_col="color",
            shade_color_col="shade_site",
            shade_alpha_col="shade_alpha",
            draw_line_at_zero="if_negative",
        )
    else:
        # Plot escape profiles and logo plots
        # Lineplot
        _, lineplot = dmslogo.draw_line(
            escape_df,
            x_col="site",
            height_col="site_escape",
            show_col="show_site",
            ax=ax[0],
            show_color="#CC6677",
            linewidth=1,
        )
        # Logoplot
        _, logoplot = dmslogo.draw_logo(
            escape_df.query("show_site == True"),
            x_col="site",
            letter_col="mutant",
            letter_height_col="escape",
            ax=ax[1],
            xtick_col="wt_site",
            color_col="color",
            shade_color_col="shade_site",
            shade_alpha_col="shade_alpha",
            draw_line_at_zero="if_negative",
        )

    # Format lineplot
    lineplot.set(ylabel="escape", xlabel=None)
    lineplot.set_xlim(1,491)
    xticks = [100, 200, 300, 400]
    lineplot.set_xticks(xticks)
    x_labels = [
        "100", 
        "200",
        "300",
        "400",
    ]
    lineplot.set_xticklabels(labels=x_labels, rotation=90, horizontalalignment="center")
    lineplot.set_title(
        antibody_name[0:4] + "." + antibody_name[4:], 
    )

    # Format logoplot
    logoplot.set(ylabel=None, xlabel=None)
    logoplot.set_title(
        "antibody " + antibody_name[0:4] + "." + antibody_name[4:], 
    )

    if len(ax) == 0:
        # Create color bar for func effects
        _, ax_scale = func_color_map.scale_bar(
            orientation="vertical", 
            ax=axes[2],
        )
    else:
        # Create color bar for func effects
        _, ax_scale = func_color_map.scale_bar(
            orientation="vertical", 
            ax=ax[2],
        )
        
    ax_scale.set_yticks([-3.0, -2.5, -2.0, -1.5, -1.0, -0.5, 0])
    scale_labels = ["-3.0", "-2.5", "-2.0", "-1.5", "-1.0", "-0.5", "≄0"]
    ax_scale.set_yticklabels(labels=scale_labels)
    ax_scale.set_title("effect on\ncell entry")

    if plots_dir != None:
        fig.savefig(plots_dir + antibody_name + "_escape_plot.svg")
InĀ [6]:
antibodies = [
    "S4378",
    "S43711",
    "S43720",
    "S43727",
    "S43742",
    "S43752",
    "S44428",
    "S44433",
    "S44446",
    "S44478",
]

for antibody in antibodies:

    # Make output dir if doesn't exist
    if not os.path.exists(filtered_csv_dir):
        os.mkdir(filtered_csv_dir)

    # Make output dir if doesn't exist
    if not os.path.exists(escape_plots_dir):
        os.mkdir(escape_plots_dir)
    
    get_filtered_csv_and_logo_plot(
        antibody,
        func_scores,
        times_seen,
        cell_entry_default,
        n_selections,
        frac_models,
        filtered_file_dir = filtered_csv_dir,
        floor = True,
        plots_dir = escape_plots_dir,
    )
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Escape summary plot for GPC-A akin antibodies

InĀ [7]:
# Flatten list of escape sites
GPC_A_escape_sites = [x for xs in GPC_A_akin_escape_sites for x in xs]

# Combined top sites from each antibody
sites_to_show = list(set(GPC_A_escape_sites))
print("Escape sites to show for GPC-A antibodies")
print(sites_to_show)

# Set figure size and subplots
fig, axes = plt.subplots(
    4, 
    3, 
    gridspec_kw={"width_ratios":[1,3,0.1]},
    figsize=(12, 12), 
    sharex="col",
)

# Adjust spacing of subplots
fig.subplots_adjust(
    wspace=0.3, 
    hspace=0.35,
    top=0.8,
    bottom=0.3,
)

for index,antibody in enumerate(GPC_A_akin_antibodies):
    get_filtered_csv_and_logo_plot(
        antibody,
        func_scores,
        times_seen,
        cell_entry_default,
        n_selections,
        frac_models,
        floor = True,
        sites = sites_to_show,
        ax = axes[index]
    )

# Make output dir if doesn't exist
if not os.path.exists(escape_plots_dir + "subset_analysis_plots/"):
    os.mkdir(escape_plots_dir + "subset_analysis_plots/")
    
# Save figure    
fig.savefig(escape_plots_dir + "subset_analysis_plots/" + "GPC-A_antibodies_escape_plot.svg")
Escape sites to show for GPC-A antibodies
[134, 135, 143, 160, 74, 76, 79, 88, 96, 225, 226, 227, 228, 229, 102, 230, 224, 236, 367]
No description has been provided for this image

Escape summary plot for GPC-A hybrid antibodies

InĀ [8]:
# Flatten list of escape sites
escape_sites = [x for xs in GPC_A_hybrid_escape_sites for x in xs]

# Combined top sites from each antibody
sites_to_show = list(set(escape_sites))
print("Escape sites to show for GPC-A hybrid antibodies")
print(sites_to_show)

# Set figure size and subplots
fig, axes = plt.subplots(
    2, 
    3, 
    gridspec_kw={"width_ratios":[1,4,0.1]},
    figsize=(16, 6), 
    sharex="col",
)

# Adjust spacing of subplots
fig.subplots_adjust(
    wspace=0.25, 
    hspace=0.35,
    top=0.8,
    bottom=0.3,
)

for index,antibody in enumerate(GPC_A_hybrid_antibodies):
    get_filtered_csv_and_logo_plot(
        antibody,
        func_scores,
        times_seen,
        cell_entry_default,
        n_selections,
        frac_models,
        floor = True,
        sites = sites_to_show,
        ax = axes[index]
    )

# Make output dir if doesn't exist
if not os.path.exists(escape_plots_dir + "subset_analysis_plots/"):
    os.mkdir(escape_plots_dir + "subset_analysis_plots/")
    
# Save figure    
fig.savefig(escape_plots_dir + "subset_analysis_plots/" + "GPC-A_hybrid_antibodies_escape_plot.svg")
Escape sites to show for GPC-A hybrid antibodies
[129, 133, 134, 135, 139, 140, 142, 143, 147, 148, 149, 153, 283, 156, 157, 160, 74, 76, 77, 79, 88, 120, 224, 98, 227, 228, 229, 236, 367, 240, 248, 250, 123, 253, 254, 127]
No description has been provided for this image

Escape summary plot for GPC-C hybrid antibodies

InĀ [9]:
# Flatten list of escape sites
escape_sites = [x for xs in GPC_C_hybrid_escape_sites for x in xs]

# Combined top sites from each antibody
sites_to_show = list(set(escape_sites))
print("Escape sites to show for GPC-C hybrid antibodies")
print(sites_to_show)

# Set figure size and subplots
fig, axes = plt.subplots(
    2, 
    3, 
    gridspec_kw={"width_ratios":[1,4,0.1]},
    figsize=(16, 6), 
    sharex="col",
)

# Adjust spacing of subplots
fig.subplots_adjust(
    wspace=0.25, 
    hspace=0.35,
    top=0.8,
    bottom=0.3,
)

for index,antibody in enumerate(GPC_C_hybrid_antibodies):
    get_filtered_csv_and_logo_plot(
        antibody,
        func_scores,
        times_seen,
        cell_entry_default,
        n_selections,
        frac_models,
        floor = True,
        sites = sites_to_show,
        ax = axes[index]
    )

# Make output dir if doesn't exist
if not os.path.exists(escape_plots_dir + "subset_analysis_plots/"):
    os.mkdir(escape_plots_dir + "subset_analysis_plots/")
    
# Save figure    
fig.savefig(escape_plots_dir + "subset_analysis_plots/" + "GPC-C_hybrid_antibodies_escape_plot.svg")
Escape sites to show for GPC-C hybrid antibodies
[129, 257, 131, 134, 135, 139, 142, 143, 144, 273, 398, 147, 148, 149, 152, 153, 160, 166, 325, 334, 337, 338, 343, 120, 119, 248, 121, 250, 123, 252, 253, 254, 127]
No description has been provided for this image

Get filtered CSV for functional effects

InĀ [10]:
# Read CSV file
func_scores = pd.read_csv(func_scores)

# Filter for minimum selections, times seen
func_scores = (
    func_scores.loc[
        (
            (func_scores["n_selections"] >= n_selections)
            &
            (func_scores["times_seen"] >= times_seen)
        )
        |
        (func_scores["wildtype"] == func_scores["mutant"])
    ]
    .drop(columns=[
        "effect_std",
        "times_seen",
        "n_selections",
    ])
    .reset_index(drop=True)
)

# Save CSV file
func_scores.to_csv(filtered_func_scores, index=False)